Chapter 5 Community composition

5.1 Filter data

load("data/data.Rdata")

Filter samples with high host data

sample_metadata <- sample_metadata %>%
  filter(!sample %in% c("EHI02721", "EHI02712", "EHI02700", "EHI02720", "EHI02749", "EHI02719", "EHI02729", "EHI02715", "EHI02722"))

genome_counts_filt <- genome_counts %>%
  select(one_of(c("genome",sample_metadata$sample)))%>%
  filter(rowSums(. != 0, na.rm = TRUE) > 0) %>%
  select_if(~!all(. == 0))
genome_counts <- genome_counts_filt
genome_metadata <- genome_metadata %>% 
  semi_join(., genome_counts_filt, by = "genome") %>% 
  arrange(match(genome,genome_counts_filt$genome))

genome_tree <- keep.tip(genome_tree, tip=genome_metadata$genome) # keep only MAG tips

#load("data/genome_gifts.Rdata")

Make a phyloseq object

phylo_samples <- sample_metadata %>% 
  column_to_rownames("sample") %>% 
  sample_data() #convert to phyloseq sample_data object
phylo_genome <- genome_counts_filt %>% 
  column_to_rownames("genome") %>% 
  otu_table(., taxa_are_rows = TRUE)
phylo_taxonomy <- genome_metadata %>%
  column_to_rownames("genome") %>% 
  as.matrix() %>% 
  tax_table() #convert to phyloseq tax_table object
phylo_tree <- phy_tree(genome_tree) 

physeq_genome <- phyloseq(phylo_genome, phylo_taxonomy, phylo_samples,phylo_tree)
physeq_genome_clr <- microbiome::transform(physeq_genome, 'clr')
save(sample_metadata, 
     genome_metadata, 
     read_counts, 
     genome_counts, 
     genome_counts_filt, 
     genome_tree,
     physeq_genome,
     physeq_genome_clr,
     genome_gifts_raw, 
#     genome_gifts,
     phylum_colors,
     diet_colors,
     file = "data/data_host_filtered.Rdata")

5.2 Load data

load("data/data_host_filtered.Rdata")

5.3 Taxonomy overview

5.3.1 Stacked barplot

genome_counts_filt %>%
  mutate_at(vars(-genome),~./sum(.)) %>% #apply TSS normalisation
  pivot_longer(-genome, names_to = "sample", values_to = "count") %>% #reduce to minimum number of columns
  left_join(., genome_metadata, by = join_by(genome == genome)) %>% #append genome metadata
  left_join(., sample_metadata, by = join_by(sample == sample)) %>% #append sample metadata
  filter(count > 0) %>% #filter 0 counts
  ggplot(., aes(x=sample,y=count, fill=phylum, group=phylum)) + #grouping enables keeping the same sorting of taxonomic units
    geom_bar(stat="identity", colour="white", linewidth=0.1) + #plot stacked bars with white borders
    scale_fill_manual(values=phylum_colors) +
  facet_grid(~ diet, scale="free", space = "free")+
 #   facet_nested(. ~ region+diet,  scales="free") + #facet per day and treatment
    guides(fill = guide_legend(ncol = 1)) +
    theme(
          axis.title.x = element_blank(),
          panel.background = element_blank(),
          panel.border = element_blank(),
          panel.grid.major = element_blank(),
          panel.grid.minor = element_blank(),
          strip.background = element_rect(fill = "white"),
    strip.text = element_text(size = 12, lineheight = 0.6),
    strip.placement = "outside",
    axis.text.x = element_blank(), axis.ticks.x = element_blank(),
          axis.line = element_line(linewidth = 0.5, linetype = "solid", colour = "black")) +
   labs(fill="Phylum",y = "Relative abundance",x="Samples")

Number of bacteria phyla

[1] 14

Bacteria phyla in wild individuals

[1] 14

Bacteria phyla captive animals

[1] 14

Bacteria phyla before grass is included in the diet

[1] 14

Bacteria phyla after grass is included in the diet

[1] 14

Number of Archaea phyla

[1] 1

Archaea phyla in wild individuals

[1] 0

Archaea phyla before grass is included in the diet

[1] "p__Methanobacteriota"

Archaea phyla after grass is included in the diet

[1] "p__Methanobacteriota"

5.3.2 Genus and species annotation

Number of MAGs without species-level annotation

[1] 749
phylum count_nospecies count_total percentage
p__Actinomycetota 15 15 100.00000
p__Bacillota 52 53 98.11321
p__Bacillota_A 516 526 98.09886
p__Bacillota_B 2 2 100.00000
p__Bacillota_C 6 9 66.66667
p__Bacteroidota 43 127 33.85827
p__Campylobacterota 1 1 100.00000
p__Cyanobacteriota 6 7 85.71429
p__Desulfobacterota 12 12 100.00000
p__Patescibacteria 13 13 100.00000
p__Pseudomonadota 32 39 82.05128
p__Spirochaetota 2 2 100.00000
p__Synergistota 18 18 100.00000
p__Verrucomicrobiota 31 35 88.57143

Percentage of MAGs without species-level annotation

[1] 87.09302

Number of MAGs without genera-level annotation

79

5.3.3 Phylum relative abundances

phylum_summary <- genome_counts_filt %>%
  mutate_at(vars(-genome),~./sum(.)) %>% #apply TSS nornalisation
  pivot_longer(-genome, names_to = "sample", values_to = "count") %>%
  left_join(sample_metadata, by = join_by(sample == sample)) %>%
  left_join(genome_metadata, by = join_by(genome == genome)) %>%
  group_by(sample,phylum,region, diet) %>%
  summarise(relabun=sum(count))
phylum_arrange <- phylum_summary %>%
    group_by(phylum) %>%
    summarise(mean=mean(relabun)) %>%
    arrange(-mean) %>%
    select(phylum) %>%
    pull()

phylum_summary %>%
    filter(phylum %in% phylum_arrange) %>%
    mutate(phylum=factor(phylum,levels=rev(phylum_arrange))) %>%
    ggplot(aes(x=relabun, y=phylum, group=phylum, color=phylum)) +
        scale_color_manual(values=phylum_colors[rev(phylum_arrange)]) +
        geom_jitter(alpha=0.5) + 
        theme_minimal() + 
        theme(legend.position="none") +
        labs(y="Phylum",x="Relative abundance")

5.3.3.1 Origin: Wild vs Captive

phylum_summary %>%
    group_by(phylum) %>%
    summarise(total_mean=mean(relabun*100, na.rm=T),
              total_sd=sd(relabun*100, na.rm=T),
              Wild_mean=mean(relabun[diet=="Wild"]*100, na.rm=T),
              Wild_sd=sd(relabun[diet=="Wild"]*100, na.rm=T),
              Captive_mean=mean(relabun[diet=="Pre_grass"]*100, na.rm=T),
              Captive_sd=sd(relabun[diet=="Pre_grass"]*100, na.rm=T)) %>%
    mutate(total=str_c(round(total_mean,3),"±",round(total_sd,3)),
           Wild=str_c(round(Wild_mean,3),"±",round(Wild_sd,3)),
           Captive=str_c(round(Captive_mean,3),"±",round(Captive_sd,3))) %>% 
    arrange(-total_mean) %>% 
    dplyr::select(phylum,total,Wild,Captive)
# A tibble: 15 × 4
   phylum               total         Wild          Captive      
   <chr>                <chr>         <chr>         <chr>        
 1 p__Bacillota_A       57.243±12.618 48.636±13.52  59.846±10.725
 2 p__Bacteroidota      25.787±10.113 26.889±11.805 24.986±10.083
 3 p__Bacillota         4.118±7.202   5.066±11.152  4.949±5.297  
 4 p__Spirochaetota     3.911±10.011  11.731±14.792 0.001±0.001  
 5 p__Verrucomicrobiota 2.264±5.722   1.472±0.715   1.449±1.295  
 6 p__Pseudomonadota    1.626±3.328   0.511±0.397   3.082±5.382  
 7 p__Patescibacteria   0.987±1.542   0.212±0.253   2.177±2.223  
 8 p__Synergistota      0.96±0.852    1.865±0.83    0.45±0.39    
 9 p__Bacillota_C       0.844±0.795   1.696±0.674   0.386±0.36   
10 p__Actinomycetota    0.78±0.985    0.427±0.459   1.217±1.147  
11 p__Cyanobacteriota   0.708±2.081   0.277±0.614   0.916±3.024  
12 p__Desulfobacterota  0.586±0.396   0.991±0.315   0.41±0.256   
13 p__Bacillota_B       0.104±0.145   0.228±0.203   0.042±0.024  
14 p__Methanobacteriota 0.082±0.217   0±0           0.09±0.228   
15 p__Campylobacterota  0±0           0±0           0±0          

5.3.3.2 Origin and diet

phylum_summary %>%
    group_by(phylum) %>%
    summarise(total_mean=mean(relabun*100, na.rm=T),
              total_sd=sd(relabun*100, na.rm=T),
              Wild_mean=mean(relabun[diet=="Wild"]*100, na.rm=T),
              Wild_sd=sd(relabun[diet=="Wild"]*100, na.rm=T),
              Pre_grass_mean=mean(relabun[diet=="Pre_grass"]*100, na.rm=T),
              Pre_grass_sd=sd(relabun[diet=="Pre_grass"]*100, na.rm=T),
              Post_grass_mean=mean(relabun[diet=="Post_grass"]*100, na.rm=T),
              Post_grass_sd=sd(relabun[diet=="Post_grass"]*100, na.rm=T))  %>%
    mutate(total=str_c(round(total_mean,2),"±",round(total_sd,2)),
           Wild=str_c(round(Wild_mean,2),"±",round(Wild_sd,2)),
           Pre_grass=str_c(round(Pre_grass_mean,6),"±",round(Pre_grass_sd,6)),
           Post_grass=str_c(round(Post_grass_mean,2),"±",round(Post_grass_sd,2))) %>% 
    arrange(-total_mean) %>% 
    dplyr::select(phylum,total,Wild,Pre_grass,Post_grass)
# A tibble: 15 × 5
   phylum               total       Wild        Pre_grass           Post_grass
   <chr>                <chr>       <chr>       <chr>               <chr>     
 1 p__Bacillota_A       57.24±12.62 48.64±13.52 59.845932±10.725422 63.25±9.01
 2 p__Bacteroidota      25.79±10.11 26.89±11.81 24.986055±10.082621 25.49±9.07
 3 p__Bacillota         4.12±7.2    5.07±11.15  4.94862±5.296738    2.34±2.73 
 4 p__Spirochaetota     3.91±10.01  11.73±14.79 0.00063±0.000727    0±0       
 5 p__Verrucomicrobiota 2.26±5.72   1.47±0.71   1.448674±1.294628   3.87±9.89 
 6 p__Pseudomonadota    1.63±3.33   0.51±0.4    3.082488±5.382343   1.29±1.52 
 7 p__Patescibacteria   0.99±1.54   0.21±0.25   2.176993±2.223404   0.57±0.41 
 8 p__Synergistota      0.96±0.85   1.86±0.83   0.449802±0.390105   0.57±0.35 
 9 p__Bacillota_C       0.84±0.79   1.7±0.67    0.385614±0.359883   0.45±0.49 
10 p__Actinomycetota    0.78±0.99   0.43±0.46   1.217389±1.147372   0.7±1.1   
11 p__Cyanobacteriota   0.71±2.08   0.28±0.61   0.915635±3.024426   0.93±1.99 
12 p__Desulfobacterota  0.59±0.4    0.99±0.31   0.410146±0.255958   0.36±0.25 
13 p__Bacillota_B       0.1±0.15    0.23±0.2    0.041532±0.023962   0.04±0.02 
14 p__Methanobacteriota 0.08±0.22   0±0         0.090366±0.227643   0.15±0.29 
15 p__Campylobacterota  0±0         0±0         0.000127±0.000226   0±0       
phylum_arrange <- phylum_summary %>%
    group_by(phylum) %>%
    summarise(mean=sum(relabun)) %>%
    arrange(-mean) %>%
    select(phylum) %>%
    pull()

phylum_summary %>%
    left_join(genome_metadata %>% select(phylum,phylum) %>% unique(),by=join_by(phylum==phylum)) %>%
#    left_join(sample_metadata,by=join_by(sample==sample)) %>%
    filter(phylum %in% phylum_arrange[1:20]) %>%
    mutate(phylum=factor(phylum,levels=rev(phylum_arrange[1:20]))) %>%
    filter(relabun > 0) %>%
    ggplot(aes(x=relabun, y=phylum, group=phylum, color=phylum)) +
        scale_color_manual(values=phylum_colors[-8]) +
        geom_jitter(alpha=0.5) + 
        facet_grid(.~diet)+
        theme_minimal() + 
        labs(y="phylum", x="Relative abundance", color="Phylum")

5.4 Taxonomy boxplot

5.4.1 Family

family_summary <- genome_counts_filt %>%
  mutate_at(vars(-genome),~./sum(.)) %>% #apply TSS nornalisation
  pivot_longer(-genome, names_to = "sample", values_to = "count") %>% #reduce to minimum number of columns
  left_join(sample_metadata, by = join_by(sample == sample)) %>% #append sample metadata
  left_join(., genome_metadata, by = join_by(genome == genome)) %>% #append genome metadata
  group_by(sample,family, diet,region) %>%
  summarise(relabun=sum(count))
family_summary$diet <- factor(family_summary$diet, levels=c("Pre_grass", "Post_grass", "Wild"))

5.4.1.1 Origin: Wild vs Captive

family_summary %>%
    group_by(family) %>%
    summarise(total_mean=mean(relabun*100, na.rm=T),
              total_sd=sd(relabun*100, na.rm=T),
              Wild_mean=mean(relabun[diet=="Wild"]*100, na.rm=T),
              Wild_sd=sd(relabun[diet=="Wild"]*100, na.rm=T),
              Cap_mean=mean(relabun[region=="Nafarroa"]*100, na.rm=T),
              Cap_sd=sd(relabun[region=="Nafarroa"]*100, na.rm=T))  %>%
    mutate(Total=str_c(round(total_mean,2),"±",round(total_sd,2)),
           Wild=str_c(round(Wild_mean,2),"±",round(Wild_sd,2)),
           Captive=str_c(round(Cap_mean,2),"±",round(Cap_sd,2))) %>% 
    arrange(-total_mean) %>% 
    dplyr::select(family,Total,Wild,Captive) %>% 
    paged_table()

5.4.1.2 Origin and Diet

family_summary %>%
    group_by(family) %>%
    summarise(total_mean=mean(relabun*100, na.rm=T),
              total_sd=sd(relabun*100, na.rm=T),
              Wild_mean=mean(relabun[diet=="Wild"]*100, na.rm=T),
              Wild_sd=sd(relabun[diet=="Wild"]*100, na.rm=T),
              Pre_grass_mean=mean(relabun[diet=="Pre_grass"]*100, na.rm=T),
              Pre_grass_sd=sd(relabun[diet=="Pre_grass"]*100, na.rm=T),
              Post_grass_mean=mean(relabun[diet=="Post_grass"]*100, na.rm=T),
              Post_grass_sd=sd(relabun[diet=="Post_grass"]*100, na.rm=T))  %>%
    mutate(Total=str_c(round(total_mean,2),"±",round(total_sd,2)),
           Wild=str_c(round(Wild_mean,2),"±",round(Wild_sd,2)),
           Pre_grass=str_c(round(Pre_grass_mean,2),"±",round(Pre_grass_sd,2)),
           Post_grass=str_c(round(Post_grass_mean,2),"±",round(Post_grass_sd,2))) %>% 
    arrange(-total_mean) %>% 
    dplyr::select(family,Total,Wild,Pre_grass,Post_grass) %>% 
    paged_table()
family_arrange <- family_summary %>%
    group_by(family) %>%
    summarise(mean=sum(relabun)) %>%
    arrange(-mean) %>%
    select(family) %>%
    pull()

family_summary %>%
    left_join(genome_metadata %>% select(family,phylum) %>% unique(),by=join_by(family==family)) %>%
    filter(family %in% family_arrange[1:20]) %>%
    mutate(family=factor(family,levels=rev(family_arrange[1:20]))) %>%
    filter(relabun > 0) %>%
    ggplot(aes(x=relabun, y=family, group=family, color=phylum)) +
        scale_color_manual(values=phylum_colors[-8]) +
        geom_jitter(alpha=0.5) + 
        facet_grid(.~diet)+
        theme_minimal() + 
        labs(y="Family", x="Relative abundance", color="Phylum")

5.4.2 Genus

genus_summary <- genome_counts_filt %>%
  mutate_at(vars(-genome),~./sum(.)) %>% #apply TSS nornalisation
  pivot_longer(-genome, names_to = "sample", values_to = "count") %>% #reduce to minimum number of columns
  left_join(sample_metadata, by = join_by(sample == sample)) %>% #append sample metadata
  left_join(genome_metadata, by = join_by(genome == genome)) %>% #append genome metadata
  group_by(sample,phylum,genus, diet) %>%
  summarise(relabun=sum(count)) %>%
  filter(genus != "g__") %>%
  mutate(genus= sub("^g__", "", genus))
genus_summary$diet <- factor(genus_summary$diet, levels=c("Pre_grass", "Post_grass", "Wild"))

5.4.3 origin and diet

genus_summary %>%
    group_by(genus) %>%
    summarise(total_mean=mean(relabun*100, na.rm=T),
              total_sd=sd(relabun*100, na.rm=T),
              Wild_mean=mean(relabun[diet=="Wild"]*100, na.rm=T),
              Wild_sd=sd(relabun[diet=="Wild"]*100, na.rm=T),
              Pre_grass_mean=mean(relabun[diet=="Pre_grass"]*100, na.rm=T),
              Pre_grass_sd=sd(relabun[diet=="Pre_grass"]*100, na.rm=T),
              Post_grass_mean=mean(relabun[diet=="Post_grass"]*100, na.rm=T),
              Post_grass_sd=sd(relabun[diet=="Post_grass"]*100, na.rm=T))  %>%
    mutate(Total=str_c(round(total_mean,2),"±",round(total_sd,2)),
           Wild=str_c(round(Wild_mean,2),"±",round(Wild_sd,2)),
           Pre_grass=str_c(round(Pre_grass_mean,2),"±",round(Pre_grass_sd,2)),
           Post_grass=str_c(round(Post_grass_mean,2),"±",round(Post_grass_sd,2))) %>% 
    arrange(-total_mean) %>% 
    dplyr::select(genus,Total,Wild,Pre_grass,Post_grass) %>% 
    paged_table()
genus_arrange <- genus_summary %>%
    group_by(genus) %>%
    summarise(mean=sum(relabun)) %>%
    filter(genus != "g__")%>%
    arrange(-mean) %>%
    select(genus) %>%
    mutate(genus= sub("^g__", "", genus)) %>%
    pull()

genus_summary_sort <- genus_summary %>%
    group_by(genus) %>%
    summarise(mean=mean(relabun, na.rm=T),sd=sd(relabun, na.rm=T)) %>%
    arrange(-mean) 

genus_summary %>%
  mutate(genus=factor(genus, levels=rev(genus_summary_sort %>% pull(genus)))) %>%
  filter(relabun > 0) %>%
  ggplot(aes(x=relabun, y=genus, group=genus, color=phylum)) +
  scale_color_manual(values=phylum_colors) +
  geom_jitter(alpha=0.5) + 
  facet_grid(.~diet)+
  theme_minimal() + 
  theme(axis.text.y = element_text(size=6))+
  labs(y="Family", x="Relative abundance", color="Phylum")